/**
* Copyright (c) 2025 Huawei Technologies Co., Ltd.
* This program is free software, you can redistribute it and/or modify it under the terms and conditions of
* CANN Open Software License Agreement Version 2.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/

/*!
 * \file kernel_tensor_impl.h
 * \brief
 */
#ifndef ASCENDC_MODULE_TENSOR_IMPL_H
#define ASCENDC_MODULE_TENSOR_IMPL_H
#include "kernel_tensor.h"
#include "kernel_operator_symbol_override_impl.h"

namespace AscendC {
template <typename Shape, size_t... Is>
__aicore__ inline constexpr auto ProdImpl(const Shape& t, Std::index_sequence<Is...>) {
    return (Std::get<Is>(t) * ... * 1);
}

template <typename... Args>
__aicore__ inline constexpr auto Prod(const Shape<Args...>& t) {
    return ProdImpl(t, Std::make_index_sequence<sizeof...(Args)>{});
}

// CPU Impl
#if defined(ASCENDC_CPU_DEBUG) && ASCENDC_CPU_DEBUG == 1
inline uint8_t* GetBaseAddrCpu(int8_t logicPos);
#endif
#if defined(ASCENDC_CPU_DEBUG) && ASCENDC_CPU_DEBUG == 1
template <typename T> LocalTensor<T>::LocalTensor(TBuffAddr& address)
{
    this->address_ = address;
}

template <typename T> LocalTensor<T>::LocalTensor(const LocalTensor<T>& other)
{
    os_.str("");
    this->address_ = other.address_;
#ifndef SPLIT_CORE_CUBE
    if constexpr (IsSameType<PrimType, T>::value) {
        this->shapeInfo_ = other.shapeInfo_;
    }
#endif
    if constexpr (is_tensorTrait_v<T>) {
        this->SetTensorTrait(other.GetTensorTrait());
    }
}

template <typename T> LocalTensor<T> LocalTensor<T>::operator = (const LocalTensor<T>& other)
{
    if (this != &other) {
        os_.str("");
        this->address_ = other.address_;
#ifndef SPLIT_CORE_CUBE
        if constexpr (IsSameType<PrimType, T>::value) {
            this->shapeInfo_ = other.shapeInfo_;
        }
#endif
    }
    if constexpr (is_tensorTrait_v<T>) {
        this->SetTensorTrait(other.GetTensorTrait());
    }
    return *this;
}

template <typename T>
typename LocalTensor<T>::PrimType* LocalTensor<T>::GetPhyAddr(const uint32_t offset) const
{
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
    if constexpr (SupportType<PrimType, int4b_t, fp4x2_e2m1_t, fp4x2_e1m2_t>()) {
#else
    if constexpr (IsSameType<PrimType, int4b_t>::value) {
#endif
        ASCENDC_DEBUG_ASSERT((this->address_.dataLen * INT4_TWO > (offset / INT4_TWO)),
        KERNEL_LOG(KERNEL_ERROR, "offset is %u, which can not be larger than data len %u", offset,
            static_cast<uint32_t>(this->address_.dataLen * INT4_TWO)));
        ASCENDC_DEBUG_ASSERT((offset % INT4_TWO == 0),
            KERNEL_LOG(KERNEL_ERROR, "The offset for int4b_t GetPhyAddr should be an even num."));
        return reinterpret_cast<PrimType *>(this->address_.absAddr) + offset / INT4_TWO;
#if (__NPU_ARCH__ == 5102)
    } else if constexpr (IsSameType<PrimType, int2b_t>::value) {
        ASCENDC_DEBUG_ASSERT((this->address_.dataLen * INT2_FOUR > (offset / INT2_FOUR)),
        KERNEL_LOG(KERNEL_ERROR, "offset is %u, which can not be larger than data len %u", offset,
            static_cast<uint32_t>(this->address_.dataLen * INT2_FOUR)));
        ASCENDC_DEBUG_ASSERT((offset % INT2_FOUR == 0),
            KERNEL_LOG(KERNEL_ERROR, "The offset for int2b_t GetPhyAddr should be multiples of 4."));
        return reinterpret_cast<PrimType *>(this->address_.absAddr) + offset / INT2_FOUR;
#endif
    } else {
        ASCENDC_DEBUG_ASSERT((this->address_.dataLen > (offset * sizeof(PrimType))),
        KERNEL_LOG(KERNEL_ERROR, "offset is %u, which can not be larger than data len %u", offset,
            static_cast<uint32_t>(this->address_.dataLen / sizeof(PrimType))));
        return reinterpret_cast<PrimType*>(this->address_.absAddr) + offset;
    }
}
template <typename T> typename LocalTensor<T>::PrimType* LocalTensor<T>::GetPhyAddr() const
{
    return GetPhyAddr(0);
}

template <typename T>
__inout_pipe__(S) typename LocalTensor<T>::PrimType LocalTensor<T>::GetValue(const uint32_t offset) const
{
    if ASCEND_IS_AIC {
        if (GetPhyType(AscendC::TPosition(this->GetPosition())) == Hardware::UB) {
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
            if constexpr (SupportType<PrimType, fp4x2_e2m1_t, fp4x2_e1m2_t>()) {
                PrimType ret;
                return ret;
            } else {
                return PrimType(0);
            }
#else
            return 0;
#endif
        }
    }
    if constexpr (IsSameType<PrimType, int4b_t>::value) {
        ASCENDC_DEBUG_ASSERT((this->address_.dataLen * INT4_TWO > (offset / INT4_TWO)),
        KERNEL_LOG(KERNEL_ERROR, "offset is %u, which can not be larger than data len %u", offset,
            static_cast<uint32_t>(this->address_.dataLen * INT4_TWO)));

        LocalTensor<uint8_t> tmp = this->ReinterpretCast<uint8_t>();
        uint8_t val = tmp.GetValue(offset / INT4_TWO);
        return static_cast<int4b_t>(val >> (4 * (offset % INT4_TWO)));
#if (__NPU_ARCH__ == 5102)
    } else if constexpr (IsSameType<PrimType, int2b_t>::value) {
        ASCENDC_DEBUG_ASSERT((this->address_.dataLen * INT2_FOUR > (offset / INT2_FOUR)),
        KERNEL_LOG(KERNEL_ERROR, "offset is %u, which can not be larger than data len %u", offset,
            static_cast<uint32_t>(this->address_.dataLen * INT2_FOUR)));

        LocalTensor<uint8_t> tmp = this->ReinterpretCast<uint8_t>();
        uint8_t val = tmp.GetValue(offset / INT2_FOUR);
        return static_cast<int2b_t>(val >> (2 * (offset % INT2_FOUR)));
#endif
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
    } else if constexpr (SupportType<PrimType, fp4x2_e2m1_t, fp4x2_e1m2_t>()) {
        ASCENDC_DEBUG_ASSERT((this->address_.dataLen * ConstantsInternal::ASCENDC_B4_TWO > (offset / ConstantsInternal::ASCENDC_B4_TWO)),
            KERNEL_LOG(KERNEL_ERROR, "offset is %u, which can not be larger than data len %u", offset,
                static_cast<uint32_t>(this->address_.dataLen * ConstantsInternal::ASCENDC_B4_TWO)));

        LocalTensor<uint8_t> tmp = this->ReinterpretCast<uint8_t>();
        uint8_t val = tmp.GetValue(offset / ConstantsInternal::ASCENDC_B4_TWO);
        PrimType ret;
        ret.val = static_cast<uint8_t>((val >> (ConstantsInternal::ASCENDC_B4_BIT_NUM * (offset % ConstantsInternal::ASCENDC_B4_TWO))) & 0xf);
        return ret;
    } else if constexpr (SupportType<PrimType, complex32>()) {
        LocalTensor<uint32_t> tmp = this->ReinterpretCast<uint32_t>();
        uint32_t val = tmp.GetValue(offset);
        return *(reinterpret_cast<PrimType*>(&val));
    }  else if constexpr (SupportType<PrimType, complex64>()) {
        LocalTensor<uint64_t> tmp = this->ReinterpretCast<uint64_t>();
        uint64_t val = tmp.GetValue(offset);
        return *(reinterpret_cast<PrimType*>(&val));
#endif
    } else {
        ASCENDC_DEBUG_ASSERT((this->address_.dataLen > (offset * sizeof(PrimType))),
        KERNEL_LOG(KERNEL_ERROR, "offset is %u, which can not be larger than data len %u", offset,
            static_cast<uint32_t>(this->address_.dataLen / sizeof(PrimType))));
        return *(GetPhyAddr(offset));
    }
}

template <typename T>
__inout_pipe__(S) typename LocalTensor<T>::PrimType& LocalTensor<T>::operator()(const uint32_t offset) const
{
    ASCENDC_DEBUG_ASSERT((this->address_.dataLen > (offset * sizeof(PrimType))),
        KERNEL_LOG(KERNEL_ERROR, "offset is %u, which can not be larger than data len %u", offset,
            static_cast<uint32_t>(this->address_.dataLen / sizeof(PrimType))));
    return *(GetPhyAddr(offset));
}
template <typename T>
template <typename U> __aicore__ inline LocalTensor<U> LocalTensor<T>::ReinterpretCast() const
{
    LocalTensor<U> output;
    output.address_.logicPos = static_cast<uint8_t>(this->GetPosition());
    output.address_.bufferHandle = this->GetBufferHandle();
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
    if constexpr (SupportType<PrimType, int4b_t, fp4x2_e2m1_t, fp4x2_e1m2_t>()) {
#else
    if constexpr (IsSameType<PrimType, int4b_t>::value) {
#endif
        output.address_.dataLen = this->GetSize() / INT4_TWO;
#if (__NPU_ARCH__ == 5102)
    } else if constexpr (IsSameType<PrimType, int2b_t>::value) {
        output.address_.dataLen = this->GetSize() / INT2_FOUR;
#endif
    } else {
        output.address_.dataLen = this->GetSize() * sizeof(PrimType);
    }
    output.address_.bufferAddr = this->address_.bufferAddr;
    output.address_.absAddr = this->address_.absAddr;
    if constexpr (is_tensorTrait_v<T> && is_tensorTrait_v<U>) {
        output.GetTensorTrait().SetLayout(this->GetTensorTrait().GetLayout());
    }
    return output;
}

template <typename T>
template <typename U> __inout_pipe__(S) void LocalTensor<T>::SetValue(const uint32_t index, const U value) const
{
    if ASCEND_IS_AIC {
        if (GetPhyType(AscendC::TPosition(this->GetPosition())) == Hardware::UB) {
            return;
        }
    }
    if constexpr (IsSameType<PrimType, int4b_t>::value) {
        ASCENDC_DEBUG_ASSERT((this->address_.dataLen * INT4_TWO > (index / INT4_TWO)),
        KERNEL_LOG(KERNEL_ERROR, "index is %u, which can not be larger than data len %u", index,
            static_cast<uint32_t>(this->address_.dataLen * INT4_TWO)));

        LocalTensor<uint8_t> tmp = this->ReinterpretCast<uint8_t>();
        uint8_t shift = (index % INT4_TWO == 0)? 0 : 4;
        uint32_t idx = index / INT4_TWO;
        uint8_t val = tmp.GetValue(idx) & (0xf << (INT4_BIT_NUM - shift));
        tmp.SetValue(idx, val + (value.storage << shift));
#if (__NPU_ARCH__ == 5102)
    } else if constexpr (IsSameType<PrimType, int2b_t>::value) {
        ASCENDC_DEBUG_ASSERT((this->address_.dataLen * INT2_FOUR > (index / INT2_FOUR)),
        KERNEL_LOG(KERNEL_ERROR, "index is %u, which can not be larger than data len %u", index,
            static_cast<uint32_t>(this->address_.dataLen * INT2_FOUR)));

        LocalTensor<uint8_t> tmp = this->ReinterpretCast<uint8_t>();
        uint8_t shift = (index % INT2_FOUR) * 2;
        uint32_t idx = index / INT2_FOUR;
        uint8_t mask = (0x3) << shift;
        uint8_t maskedVal = tmp.GetValue(idx) & (~mask);
        tmp.SetValue(idx, maskedVal + (value.storage << shift));
#endif
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
    } else if constexpr (SupportType<PrimType, fp4x2_e2m1_t, fp4x2_e1m2_t>()) {
        ASCENDC_DEBUG_ASSERT((this->address_.dataLen * ConstantsInternal::ASCENDC_B4_TWO > (index / ConstantsInternal::ASCENDC_B4_TWO)),
            KERNEL_LOG(KERNEL_ERROR, "index is %u, which can not be larger than data len %u", index,
                static_cast<uint32_t>(this->address_.dataLen * ConstantsInternal::ASCENDC_B4_TWO)));

        LocalTensor<uint8_t> tmp = this->ReinterpretCast<uint8_t>();
        uint8_t shift = (index % ConstantsInternal::ASCENDC_B4_TWO == 0)? 0 : ConstantsInternal::ASCENDC_B4_BIT_NUM;
        uint32_t idx = index / ConstantsInternal::ASCENDC_B4_TWO;
        uint8_t val = tmp.GetValue(idx) & (0xf << (ConstantsInternal::ASCENDC_B4_BIT_NUM - shift));
        tmp.SetValue(idx, val + (value << shift));
#endif
    } else {
        ASCENDC_DEBUG_ASSERT((this->address_.dataLen > (index * sizeof(PrimType))),
        KERNEL_LOG(KERNEL_ERROR, "index is %u, which can not be larger than data len %u", index,
            static_cast<uint32_t>(this->address_.dataLen / sizeof(PrimType))));
        *(GetPhyAddr(index)) = PrimType(value);
    }
}

template <typename T> LocalTensor<T> LocalTensor<T>::operator[](const uint32_t offset) const
{
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
    if constexpr (SupportType<PrimType, int4b_t, fp4x2_e2m1_t, fp4x2_e1m2_t>()) {
#else
    if constexpr (IsSameType<PrimType, int4b_t>::value) {
#endif
        ASCENDC_DEBUG_ASSERT((this->address_.dataLen > (offset / INT4_TWO)),
        KERNEL_LOG(KERNEL_ERROR, "offset is %u, which can not be larger than data len %u", offset,
            static_cast<uint32_t>(this->address_.dataLen * INT4_TWO)));
#if (__NPU_ARCH__ == 5102)
    } else if constexpr (IsSameType<PrimType, int2b_t>::value) {
        ASCENDC_DEBUG_ASSERT((this->address_.dataLen > (offset / INT2_FOUR)),
        KERNEL_LOG(KERNEL_ERROR, "offset is %u, which can not be larger than data len %u", offset,
            static_cast<uint32_t>(this->address_.dataLen * INT2_FOUR)));
#endif
    } else {
        ASCENDC_DEBUG_ASSERT((this->address_.dataLen > (offset * sizeof(PrimType))),
        KERNEL_LOG(KERNEL_ERROR, "offset is %u, which can not be larger than data len %u", offset,
            static_cast<uint32_t>(this->address_.dataLen / sizeof(PrimType))));
    }

    LocalTensor result = *this;
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
    if constexpr (SupportType<PrimType, int4b_t, fp4x2_e2m1_t, fp4x2_e1m2_t>()) {
#else
    if constexpr (IsSameType<PrimType, int4b_t>::value) {
#endif
        result.address_.dataLen -= (offset / INT4_TWO);
        result.address_.absAddr = result.address_.absAddr + offset / INT4_TWO;
        result.address_.bufferAddr = result.address_.bufferAddr + offset / INT4_TWO;
#if (__NPU_ARCH__ == 5102)
    } else if constexpr (IsSameType<PrimType, int2b_t>::value) {
        result.address_.dataLen -= (offset / INT2_FOUR);
        result.address_.absAddr = result.address_.absAddr + offset / INT2_FOUR;
        result.address_.bufferAddr = result.address_.bufferAddr + offset / INT2_FOUR;
#endif
    } else {
        result.address_.dataLen -= (offset * sizeof(PrimType));
        result.address_.absAddr = result.address_.absAddr + offset * sizeof(PrimType);
        result.address_.bufferAddr = result.address_.bufferAddr + offset * sizeof(PrimType);
    }
    result.os_.str("");
    return result;
}

template <typename T>
template <typename U>
[[deprecated("NOTICE: SetAddrWithOffset has been deprecated and will be removed in the next version. "
    "Please do not use it!")]]
void LocalTensor<T>::SetAddrWithOffset(LocalTensor<U> &src, uint32_t offset)
{
    this->address_ = src.address_;
    this->address_.bufferAddr += offset * sizeof(PrimT<U>);
    this->address_.absAddr += offset * sizeof(PrimT<U>);
}

#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
template<typename T>
inline std::string GetComplexStr(const T& val)
{
    static constexpr uint16_t all16BitsOne = 0xFFFF;
    static constexpr uint32_t all32BitsOne = 0xFFFFFFFF;
    static_assert(SupportType<T, complex32, complex64>(), "Unsupported complex type.");
    std::string str = "";
    if constexpr(std::is_same_v<T, complex32>) {
        if (val.imag.ToFloat() > 0 || !(*(reinterpret_cast<const uint16_t*>(&val.imag)) & all16BitsOne)) {
            str += std::to_string(val.real.ToFloat()) + "+" + std::to_string(val.imag.ToFloat()) + "j ";
        } else {
            str += std::to_string(val.real.ToFloat()) + std::to_string(val.imag.ToFloat()) + "j ";
        }
    } else {
        if (val.imag > 0 || !(*(reinterpret_cast<const uint32_t*>(&val.imag)) & all32BitsOne)) {
            str += std::to_string(val.real) + "+" + std::to_string(val.imag) + "j ";
        } else {
            str += std::to_string(val.real) + std::to_string(val.imag) + "j ";
        }
    }
    return str;
}
#endif

template <typename T>
[[deprecated("NOTICE: Print has been deprecated and will be removed in the next version. Please do not use "
    "it!")]]
inline void LocalTensor<T>::Print(uint32_t len)
{
    if constexpr (IsSameType<PrimType, half>::value) {
        PrintTypicalFloat(len, sizeof(half));
        return;
    }
#if (__NPU_ARCH__ == 2201) || (__NPU_ARCH__ == 3002) || (__NPU_ARCH__ == 3102) ||   \
    (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
    if constexpr (IsSameType<PrimType, bfloat16_t>::value) {
        PrintTypicalFloat(len, sizeof(bfloat16_t));
        return;
    }
    if constexpr (!IsSameType<PrimType, half>::value && !IsSameType<PrimType, bfloat16_t>::value) {
#else
    if constexpr (!IsSameType<PrimType, half>::value) {
#endif
        os_.str("");
        uint32_t printLen = std::min(len, GetSize());
        uint32_t blockNum = ONE_BLK_SIZE / sizeof(PrimType);
        uint32_t rowNum = printLen / blockNum;
        uint32_t residualNum = printLen % blockNum;
        const int32_t width = 4;
        for (uint32_t i = 0; i < rowNum; i++) {
            os_ << std::setw(width) << std::setfill('0') << i * blockNum << " : ";
            for (uint32_t j = 0; j < blockNum; j++) {
                if constexpr ((sizeof(PrimType) == sizeof(int8_t)) || (sizeof(PrimType) == sizeof(bool))) {
                    os_ << static_cast<int32_t>(GetValue(i * blockNum + j)) << " ";
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
                } else if constexpr (std::is_same_v<PrimType, fp8_e4m3fn_t> || std::is_same_v<PrimType, fp8_e5m2_t> ||
                                     std::is_same_v<PrimType, hifloat8_t>) {
                    os_ << GetValue(i * blockNum + j).ToFloat() << " ";
                } else if constexpr (std::is_same_v<PrimType, complex32> || std::is_same_v<PrimType, complex64>) {
                    PrimType val = GetValue(i * blockNum + j);
                    os_ << GetComplexStr(val);
#endif
                } else {
                    os_ << GetValue(i * blockNum + j) << " ";
                }
            }
            os_ << std::endl;
        }
        if (residualNum != 0) {
            os_ << std::setw(width) << std::setfill('0') << rowNum * blockNum << " : ";
            for (uint32_t i = 0; i < residualNum; i++) {
                if constexpr ((sizeof(PrimType) == sizeof(int8_t)) || (sizeof(PrimType) == sizeof(bool))) {
                    os_ << static_cast<int32_t>(GetValue(rowNum * blockNum + i)) << " ";
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
                } else if constexpr (std::is_same_v<PrimType, fp8_e4m3fn_t> || std::is_same_v<PrimType, fp8_e5m2_t> ||
                                     std::is_same_v<PrimType, hifloat8_t>) {
                    os_ << GetValue(rowNum * blockNum + i).ToFloat() << " ";
                } else if constexpr (std::is_same_v<PrimType, complex32> || std::is_same_v<PrimType, complex64>) {
                    PrimType val = GetValue(rowNum * blockNum + i);
                    os_ << GetComplexStr(val);
#endif
                } else {
                    os_ << GetValue(rowNum * blockNum + i) << " ";
                }
            }
            os_ << std::endl;
        }
        std::cout << os_.str();
    }
}

template <typename T> inline void LocalTensor<T>::PrintTypicalFloat(uint32_t len, uint32_t dataSize)
{
    os_.str("");
    uint32_t printLen = std::min(len, GetSize());
    uint32_t blockNum = ONE_BLK_SIZE / dataSize;
    uint32_t rowNum = printLen / blockNum;
    uint32_t residualNum = printLen % blockNum;
    const int32_t width = 4;
    for (uint32_t i = 0; i < rowNum; i++) {
        os_ << std::setw(width) << std::setfill('0') << i * blockNum << " : ";
        for (uint32_t j = 0; j < blockNum; j++) {
            os_ << GetValue(i * blockNum + j).ToFloat() << " ";
        }
        os_ << std::endl;
    }
    if (residualNum != 0) {
        os_ << std::setw(width) << std::setfill('0') << rowNum * blockNum << " : ";
        for (uint32_t i = 0; i < residualNum; i++) {
            os_ << GetValue(rowNum * blockNum + i).ToFloat() << " ";
        }
        os_ << std::endl;
    }
    std::cout << os_.str();
}

#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
template <typename T> inline void LocalTensor<T>::PrintFp4E1M2(uint32_t len) const
{
    std::ostringstream os;
    os.str("");
    uint32_t printLen = std::min(len, GetSize());
    constexpr uint32_t blockNum = ONE_BLK_SIZE * ConstantsInternal::ASCENDC_B4_TWO;
    uint32_t rowNum = printLen / blockNum;
    uint32_t residualNum = printLen % blockNum;
    const int32_t width = 4;
    for (uint32_t i = 0; i < rowNum; i++) {
        os << std::setw(width) << std::setfill('0') << i * blockNum << " : ";
        for (uint32_t j = 0; j < blockNum; j++) {
            fp4x2_e1m2_t fp4Val = GetValue(i * blockNum + j);
            bfloat16_t newbfval = bfloat16::Bf16T(fp4Val);
            os << newbfval.ToFloat() << " ";
        }
        os << std::endl;
    }
    if (residualNum != 0) {
        os << std::setw(width) << std::setfill('0') << rowNum * blockNum << " : ";
        for (uint32_t i = 0; i < residualNum; i++) {
            fp4x2_e1m2_t fp4Val = GetValue(rowNum * blockNum + i);
            bfloat16_t newbfval = bfloat16::Bf16T(fp4Val);
            os << newbfval.ToFloat() << " ";
        }
        os << std::endl;
    }
    std::cout << os.str();
}

template <typename T> inline void LocalTensor<T>::PrintFp4E2M1(uint32_t len) const
{
    std::ostringstream os;
    os.str("");
    uint32_t printLen = std::min(len, GetSize());
    constexpr uint32_t blockNum = ONE_BLK_SIZE * ConstantsInternal::ASCENDC_B4_TWO;
    uint32_t rowNum = printLen / blockNum;
    uint32_t residualNum = printLen % blockNum;
    const int32_t width = 4;
    for (uint32_t i = 0; i < rowNum; i++) {
        os << std::setw(width) << std::setfill('0') << i * blockNum << " : ";
        for (uint32_t j = 0; j < blockNum; j++) {
            fp4x2_e2m1_t fp4Val = GetValue(i * blockNum + j);
            bfloat16_t newbfval = bfloat16::Bf16T(fp4Val);
            os << newbfval.ToFloat() << " ";
        }
        os << std::endl;
    }
    if (residualNum != 0) {
        os << std::setw(width) << std::setfill('0') << rowNum * blockNum << " : ";
        for (uint32_t i = 0; i < residualNum; i++) {
            fp4x2_e2m1_t fp4Val = GetValue(rowNum * blockNum + i);
            bfloat16_t newbfval = bfloat16::Bf16T(fp4Val);
            os << newbfval.ToFloat() << " ";
        }
        os << std::endl;
    }
    std::cout << os.str();
}
#endif

template <>
[[deprecated("NOTICE: Print has been deprecated and will be removed in the next version. Please do not use "
    "it!")]]
inline void LocalTensor<half>::Print(uint32_t len)
{
    PrintTypicalFloat(len, sizeof(half));
}

#if (__NPU_ARCH__ == 2201) || (__NPU_ARCH__ == 3002) || (__NPU_ARCH__ == 3102) ||   \
    (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
template <>
[[deprecated("NOTICE: Print has been deprecated and will be removed in the next version. Please do not use "
"it!")]]
inline void LocalTensor<bfloat16_t>::Print(uint32_t len)
{
    PrintTypicalFloat(len, sizeof(bfloat16_t));
}
#endif

#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
template <>
[[deprecated("NOTICE: Print has been deprecated and will be removed in the next version. Please do not use "
"it!")]]
inline void LocalTensor<fp4x2_e2m1_t>::Print(uint32_t len)
{
    PrintFp4E2M1(len);
}

template <>
[[deprecated("NOTICE: Print has been deprecated and will be removed in the next version. Please do not use "
"it!")]]
inline void LocalTensor<fp4x2_e1m2_t>::Print(uint32_t len)
{
    PrintFp4E1M2(len);
}
#endif

template <typename T>
[[deprecated("NOTICE: Print has been deprecated and will be removed in the next version. Please do not use "
"it!")]]
inline void LocalTensor<T>::Print()
{
    Print(GetSize());
}

template <typename T> LocalTensor<T>::~LocalTensor() {}

template <typename T>
[[deprecated("NOTICE: ToFile has been deprecated and will be removed in the next version. Please do not use "
"it!")]]
int32_t LocalTensor<T>::ToFile(const std::string &fileName) const
{
    return TensorWriteFile(fileName, reinterpret_cast<const PrimType *>(GetPhyAddr()), GetSize() * sizeof(PrimType));
}
// Npu Impl
#else
template <typename T> __aicore__ inline uint64_t LocalTensor<T>::GetPhyAddr() const
{
    return GetPhyAddr(0);
}
template <typename T> __aicore__ inline uint64_t LocalTensor<T>::GetPhyAddr(const uint32_t offset) const
{
    if constexpr (IsSameType<PrimType, int4b_t>::value) {
        return this->address_.bufferAddr + offset / INT4_TWO;
#if (__NPU_ARCH__ == 5102)
    } else if constexpr (IsSameType<PrimType, int2b_t>::value) {
        return this->address_.bufferAddr + offset / INT2_FOUR;
#endif
    } else {
        return this->address_.bufferAddr + offset * sizeof(PrimType);
    }
}
template <typename T> __aicore__ inline __inout_pipe__(S)
    typename LocalTensor<T>::PrimType LocalTensor<T>::GetValue(const uint32_t index) const
{
    if ASCEND_IS_AIC {
        if (GetPhyType(AscendC::TPosition(this->GetPosition())) == Hardware::UB) {
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
            if constexpr (SupportType<PrimType, fp4x2_e2m1_t, fp4x2_e1m2_t, fp8_e4m3fn_t, fp8_e5m2_t, hifloat8_t>()) {
                PrimType ret;
                return ret;
            } else {
                return PrimType(0);
            }
#else
            return PrimType(0);
#endif
        }
    }
    if constexpr (IsSameType<PrimType, int4b_t>::value) {
        LocalTensor<uint8_t> tmp = this->ReinterpretCast<uint8_t>();
        uint8_t val = tmp.GetValue(index / INT4_TWO);
        return static_cast<int4b_t>(val >> (INT4_BIT_NUM * (index % INT4_TWO)));
#if (__NPU_ARCH__ == 5102)
    } else if constexpr (IsSameType<PrimType, int2b_t>::value) {
        LocalTensor<uint8_t> tmp = this->ReinterpretCast<uint8_t>();
        uint8_t val = tmp.GetValue(index / INT2_FOUR);
        return static_cast<int2b_t>(val >> (INT2_BIT_NUM * (index % INT2_FOUR)));
#endif
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
    } else if constexpr (SupportType<PrimType, complex32>()) {
        LocalTensor<uint32_t> tmp = this->ReinterpretCast<uint32_t>();
        uint32_t val = tmp.GetValue(index);
        return *(reinterpret_cast<PrimType*>(&val));
    }  else if constexpr (SupportType<PrimType, complex64>()) {
        LocalTensor<uint64_t> tmp = this->ReinterpretCast<uint64_t>();
        uint64_t val = tmp.GetValue(index);
        return *(reinterpret_cast<PrimType*>(&val));
#endif
    } else {
        return *(reinterpret_cast<__ubuf__ PrimType*>(GetPhyAddr(index)));
    }
}
template <typename T> __aicore__ inline __inout_pipe__(S)
    __ubuf__ typename LocalTensor<T>::PrimType& LocalTensor<T>::operator()(const uint32_t offset) const
{
    return *(reinterpret_cast<__ubuf__ PrimType*>(GetPhyAddr(offset)));
}

template <typename T>
template <typename U> __aicore__ inline __sync_alias__ LocalTensor<U> LocalTensor<T>::ReinterpretCast() const
{
    LocalTensor<U> output;
    output.address_.logicPos = static_cast<uint8_t>(this->GetPosition());
    output.address_.bufferHandle = this->GetBufferHandle();
    if constexpr (IsHalfByteDataType<PrimType>()) {
        output.address_.dataLen = this->GetSize() / INT4_TWO;
#if (__NPU_ARCH__ == 5102)
    } else if constexpr(IsSameType<PrimType, int2b_t>::value) {
        output.address_.dataLen = this->GetSize() / INT2_FOUR;
#endif
    } else {
        output.address_.dataLen = this->GetSize() * sizeof(PrimType);
    }
    output.address_.bufferAddr = this->address_.bufferAddr;
    if constexpr (is_tensorTrait_v<T> && is_tensorTrait_v<U>) {
        output.SetTensorTrait(this->GetTensorTrait());
    }
    return output;
}

template <typename T>
template <typename U> __aicore__ inline __inout_pipe__(S)
    void LocalTensor<T>::SetValue(const uint32_t index, const U value) const
{
    if ASCEND_IS_AIC {
        if (GetPhyType(AscendC::TPosition(this->GetPosition())) == Hardware::UB) {
            return;
        }
    }
    if constexpr (IsSameType<PrimType, int4b_t>::value) {
        LocalTensor<uint8_t> tmp = this->ReinterpretCast<uint8_t>();
        uint8_t mask = (index % INT4_TWO == 0)? 0xf0 : 0xf;
        uint32_t idx = index / INT4_TWO;
        uint8_t val = tmp.GetValue(idx) & mask;
        uint8_t shift = (index % INT4_TWO == 0)? 0 : INT4_BIT_NUM;
        tmp.SetValue(idx, val + (value.storage << shift));
#if (__NPU_ARCH__ == 5102)
    } else if constexpr (IsSameType<PrimType, int2b_t>::value) {
        LocalTensor<uint8_t> tmp = this->ReinterpretCast<uint8_t>();
        uint8_t shift = (index % INT2_FOUR ) * 2;
        uint32_t idx = index / INT2_FOUR;
        uint8_t mask = (0x3) << shift;
        uint8_t maskedVal = tmp.GetValue(idx) & (~mask);
        tmp.SetValue(idx, maskedVal + (value.storage << shift));
#endif
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
    } else if constexpr (SupportType<PrimType, fp4x2_e2m1_t, fp4x2_e1m2_t>()) {
        LocalTensor<uint8_t> tmp = this->ReinterpretCast<uint8_t>();
        uint8_t mask = (index % ConstantsInternal::ASCENDC_B4_TWO == 0)? 0xf0 : 0xf;
        uint32_t idx = index / ConstantsInternal::ASCENDC_B4_TWO;
        uint8_t val = tmp.GetValue(idx) & mask;
        uint8_t shift = (index % ConstantsInternal::ASCENDC_B4_TWO == 0)? 0 : ConstantsInternal::ASCENDC_B4_BIT_NUM;
        tmp.SetValue(idx, val + (value << shift));
    } else if constexpr (SupportType<PrimType, complex32, complex64>()) {
        __ubuf__ PrimType* tmpAddr = reinterpret_cast<__ubuf__ T*>(static_cast<uint64_t>(this->address_.bufferAddr))
            + index;
        PrimType valueAux(value);
        *(reinterpret_cast<__ubuf__ typename PrimType::EleType*>(tmpAddr)) = valueAux.real;
        *(reinterpret_cast<__ubuf__ typename PrimType::EleType*>(tmpAddr) + 1) = valueAux.imag;
#endif
    } else {
        *(reinterpret_cast<__ubuf__ PrimType*>(static_cast<uint64_t>(this->address_.bufferAddr))
            + index) = static_cast<PrimType>(value);
    }
}

template <typename T> __aicore__ inline LocalTensor<T> LocalTensor<T>::operator[](const uint32_t offset) const
{
    LocalTensor result = *this;
    if constexpr (IsHalfByteDataType<PrimType>()) {
        result.address_.dataLen -= (offset / INT4_TWO);
        result.address_.bufferAddr = result.address_.bufferAddr + offset / INT4_TWO;
#if (__NPU_ARCH__ == 5102)
    } else if constexpr (IsSameType<PrimType, int2b_t>::value) {
        result.address_.dataLen -= (offset / INT2_FOUR);
        result.address_.bufferAddr = result.address_.bufferAddr + offset / INT2_FOUR;
#endif
    } else {
        result.address_.dataLen -= (offset * sizeof(PrimType));
        result.address_.bufferAddr = result.address_.bufferAddr + offset * sizeof(PrimType);
    }
    return result;
}

template <typename T>
template <typename U>
[[deprecated("NOTICE: SetAddrWithOffset has been deprecated and will be removed in the next version. "
    "Please do not use it!")]]
__aicore__ inline void LocalTensor<T>::SetAddrWithOffset(LocalTensor<U> &src, uint32_t offset)
{
    this->address_ = src.address_;
    this->address_.bufferAddr += offset * sizeof(PrimT<U>);
}
#endif
template <typename T> __aicore__ inline int32_t LocalTensor<T>::GetPosition() const
{
    return this->address_.logicPos;
}

template <typename T> __aicore__ inline void LocalTensor<T>::SetSize(const uint32_t size)
{
#if ASCENDC_CPU_DEBUG
    uint32_t len = IsSameType<PrimType, int4b_t>::value ? size / INT4_TWO : size * sizeof(PrimType);
    ASCENDC_DEBUG_ASSERT(((this->address_.absAddr -
        (uint8_t*)(GetBaseAddrCpu(int8_t(AscendC::TPosition(this->address_.logicPos)))) + len) <=
        ConstDefiner::Instance().bufferInitLen.at(ConstDefiner::Instance().positionHardMap.at(
        AscendC::TPosition(this->address_.logicPos)))), KERNEL_LOG(KERNEL_ERROR,
                "Failed to check param size value in SetSize, current value is %d, buffer overflow", len));
#endif
    if constexpr (IsHalfByteDataType<PrimType>()) {
        this->address_.dataLen = size / INT4_TWO;
#if (__NPU_ARCH__ == 5102)
    } else if constexpr (IsSameType<PrimType, int2b_t>::value) {
        this->address_.dataLen = size / INT2_FOUR;
#endif
    } else {
        this->address_.dataLen = size * sizeof(PrimType);
    }
}
template <typename T>
__aicore__ inline uint32_t LocalTensor<T>::GetSize() const
{
    if constexpr (IsHalfByteDataType<PrimType>()) {
        return this->address_.dataLen * INT4_TWO;
#if (__NPU_ARCH__ == 5102)
    } else if constexpr (IsSameType<PrimType, int2b_t>::value) {
        return this->address_.dataLen * INT2_FOUR;
#endif
    } else {
        return this->address_.dataLen / sizeof(PrimType);
    }
}

template <typename T>
[[deprecated("NOTICE: GetLength has been deprecated and will be removed in the next version. Please do not use "
                "it!")]]
__aicore__ inline uint32_t LocalTensor<T>::GetLength() const
{
    return this->address_.dataLen;
}

template <typename T>
__aicore__ inline void LocalTensor<T>::SetBufferLen(uint32_t dataLen)
{
    this->address_.dataLen = dataLen;
#if defined(__NPU_ARCH__) && (__NPU_ARCH__ == 3101)
    if ASCEND_IS_AIV {
        this->address_.bufferAddr = set_ub_addr_upper_bound(this->address_.bufferAddr, dataLen * sizeof(T));
    }
#endif
}
template <typename T> __aicore__ inline void LocalTensor<T>::SetUserTag(const TTagType tag)
{
    auto ptr = reinterpret_cast<TBufType*>(this->address_.bufferHandle);
    ASCENDC_DEBUG_ASSERT((ptr != nullptr),
                    KERNEL_LOG(KERNEL_ERROR, "ptr can not be nullptr"));
    ptr->usertag = tag;
}
template <typename T> __aicore__ inline TTagType LocalTensor<T>::GetUserTag() const
{
    auto ptr = reinterpret_cast<TBufType*>(this->address_.bufferHandle);
    ASCENDC_DEBUG_ASSERT((ptr != nullptr),
                    KERNEL_LOG(KERNEL_ERROR, "ptr can not be nullptr"));
    return ptr->usertag;
}

template <typename T>
template <typename U>
__aicore__ inline void LocalTensor<T>::CreateTensor(AscendC::TPosition pos, uint32_t addr, uint32_t tileSize)
{
#if ASCENDC_CPU_DEBUG
    ASCENDC_DEBUG_ASSERT((pos != AscendC::TPosition::GM) && (pos != AscendC::TPosition::MAX),
        KERNEL_LOG(KERNEL_ERROR, "position input should not be GM or MAX"));
    AscendC::Hardware hardPos = ConstDefiner::Instance().positionHardMap.at(pos);
    uint32_t maxLen = ConstDefiner::Instance().bufferInitLen.at(hardPos);
    ASCENDC_DEBUG_ASSERT((addr % ONE_BLK_SIZE == 0),
            KERNEL_LOG(KERNEL_ERROR, "addr input is %u, which shoule be 32 bytes align", addr));
    if constexpr (IsHalfByteDataType<PrimType>()) {
        ASCENDC_DEBUG_ASSERT(((tileSize > 0) && ((tileSize % INT4_TWO) == 0) && (tileSize / INT4_TWO) <= maxLen),
            KERNEL_LOG(KERNEL_ERROR,
                "tensor size input is %u, which shoule be even number in range (0, %u]",
                tileSize, maxLen * INT4_TWO));
    } else {
        ASCENDC_DEBUG_ASSERT(((tileSize > 0) && (tileSize * sizeof(U)) <= maxLen),
            KERNEL_LOG(KERNEL_ERROR, "tensor size input is %u, which shoule be in range (0, %u]",
                tileSize, maxLen / sizeof(U)));
    }
    uint32_t tensorLength = tileSize * SizeOfBits<U>::value / SizeOfBits<uint8_t>::value;
    ASCENDC_DEBUG_ASSERT((tensorLength % ONE_BLK_SIZE == 0),
            KERNEL_LOG(KERNEL_ERROR, "tensor length is %u bytes, which shoule be 32 bytes align", tensorLength));
    ASCENDC_DEBUG_ASSERT(((addr >= 0) && ((addr + tensorLength) <= maxLen)),
        KERNEL_LOG(KERNEL_ERROR,
            "addr input is %u, tensor length is %u bytes, which exceeds max len %u bytes",
            addr,
            tensorLength,
            maxLen));
    this->address_.bufferHandle = nullptr;
    this->address_.absAddr = GetBaseAddrCpu(static_cast<int8_t>(pos)) + addr;
#endif
    this->address_.dataLen = SizeOfBits<U>::value * tileSize / SizeOfBits<uint8_t>::value;
    this->address_.bufferAddr = addr;
    this->address_.logicPos = static_cast<uint8_t>(pos);
}

template <typename T>
__aicore__ inline LocalTensor<T>::LocalTensor(AscendC::TPosition pos, uint32_t addr, uint32_t tileSize)
{
    static_assert(!is_tensorTrait_v<T>, "currently not support TensorTrait type!!");
    CreateTensor<T>(pos, addr, tileSize);
}

template <typename T>
__aicore__ inline LocalTensor<T>::LocalTensor(uint32_t addr)
{
    static_assert(is_tensorTrait_v<T>, "only support TensorTrait type Tensor!");
    static_assert((T::tPos != TPosition::GM) && (T::tPos != TPosition::MAX),
            "TensorTrait position should not be GM or MAX!");
    uint32_t tensorSize = Prod(this->GetTensorTrait().GetLayout().GetShape());
    CreateTensor<typename T::LiteType>(T::tPos, addr, tensorSize);
}

// symbol override
template <typename T> __aicore__ inline void LocalTensor<T>::operator = (const SymbolOverrideAdd<T>& symbolOverride)
{
    symbolOverride.Process(*this);
}
template <typename T> __aicore__ inline void LocalTensor<T>::operator = (const SymbolOverrideSub<T>& symbolOverride)
{
    symbolOverride.Process(*this);
}
template <typename T> __aicore__ inline void LocalTensor<T>::operator = (const SymbolOverrideMul<T>& symbolOverride)
{
    symbolOverride.Process(*this);
}
template <typename T> __aicore__ inline void LocalTensor<T>::operator = (const SymbolOverrideDiv<T>& symbolOverride)
{
    symbolOverride.Process(*this);
}
template <typename T> __aicore__ inline void LocalTensor<T>::operator = (const SymbolOverrideOr<T>& symbolOverride)
{
    symbolOverride.Process(*this);
}
template <typename T> __aicore__ inline void LocalTensor<T>::operator = (const SymbolOverrideAnd<T>& symbolOverride)
{
    symbolOverride.Process(*this);
}

#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
template <typename T>
template <typename U>
__aicore__ inline void LocalTensor<T>::operator = (const SymbolOverrideCompare<U>& symbolOverride)
{
    symbolOverride.Process(*this);
}
#else
template <typename T> __aicore__ inline void
    LocalTensor<T>::operator = (const SymbolOverrideCompare<float>& symbolOverride)
{
    symbolOverride.Process(*this);
}
template <typename T> __aicore__ inline void
    LocalTensor<T>::operator = (const SymbolOverrideCompare<half>& symbolOverride)
{
    symbolOverride.Process(*this);
}
#endif

template <typename T> __aicore__ inline SymbolOverrideAdd<T>
    LocalTensor<T>::operator + (const LocalTensor<T>& src1) const
{
    return SymbolOverrideAdd<T>(*this, src1);
}
template <typename T> __aicore__ inline SymbolOverrideSub<T>
    LocalTensor<T>::operator - (const LocalTensor<T>& src1) const
{
    return SymbolOverrideSub<T>(*this, src1);
}
template <typename T> __aicore__ inline SymbolOverrideMul<T>
    LocalTensor<T>::operator *(const LocalTensor<T>& src1) const
{
    return SymbolOverrideMul<T>(*this, src1);
}
template <typename T> __aicore__ inline SymbolOverrideDiv<T>
    LocalTensor<T>::operator / (const LocalTensor<T>& src1) const
{
    return SymbolOverrideDiv<T>(*this, src1);
}
template <typename T> __aicore__ inline SymbolOverrideOr<T>
    LocalTensor<T>::operator | (const LocalTensor<T>& src1) const
{
    return SymbolOverrideOr<T>(*this, src1);
}
template <typename T> __aicore__ inline SymbolOverrideAnd<T>
    LocalTensor<T>::operator & (const LocalTensor<T>& src1) const
{
    return SymbolOverrideAnd<T>(*this, src1);
}
template <typename T> __aicore__ inline SymbolOverrideCompare<T>
    LocalTensor<T>::operator < (const LocalTensor<T>& src1) const
{
    return SymbolOverrideCompare<T>(*this, src1, CMPMODE::LT);
}
template <typename T> __aicore__ inline SymbolOverrideCompare<T>
    LocalTensor<T>::operator > (const LocalTensor<T>& src1) const
{
    return SymbolOverrideCompare<T>(*this, src1, CMPMODE::GT);
}
template <typename T> __aicore__ inline SymbolOverrideCompare<T>
    LocalTensor<T>::operator != (const LocalTensor<T>& src1) const
{
    return SymbolOverrideCompare<T>(*this, src1, CMPMODE::NE);
}
template <typename T> __aicore__ inline SymbolOverrideCompare<T>
    LocalTensor<T>::operator == (const LocalTensor<T>& src1) const
{
    return SymbolOverrideCompare<T>(*this, src1, CMPMODE::EQ);
}
template <typename T> __aicore__ inline SymbolOverrideCompare<T>
    LocalTensor<T>::operator <= (const LocalTensor<T>& src1) const
{
    return SymbolOverrideCompare<T>(*this, src1, CMPMODE::LE);
}
template <typename T> __aicore__ inline SymbolOverrideCompare<T>
    LocalTensor<T>::operator >= (const LocalTensor<T>& src1) const
{
    return SymbolOverrideCompare<T>(*this, src1, CMPMODE::GE);
}
template <typename T> __aicore__ inline void
    LocalTensor<T>::SetShapeInfo(const ShapeInfo& shapeInfo)
{
    static_assert(IsSameType<T, PrimType>::value, "only primitive type Tensor has shape info!");
#ifndef SPLIT_CORE_CUBE
        shapeInfo_ = shapeInfo;
#endif
}
template <typename T> __aicore__ inline ShapeInfo LocalTensor<T>::GetShapeInfo() const
{
    static_assert(IsSameType<T, PrimType>::value, "only primitive type Tensor has shape info!");
#ifndef SPLIT_CORE_CUBE
        return shapeInfo_;
#else
        ShapeInfo tmp;
        return tmp;
#endif
}

template <typename T> __aicore__ inline void
    GlobalTensor<T>::SetGlobalBuffer(__gm__ typename GlobalTensor<T>::PrimType* buffer, uint64_t bufferSize)
{
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
    this->oriAddress_ = buffer;
    // For the case GlobalTensor set through KFC message, origin cacheMode is lost.
    if (this->cacheMode_ == CacheMode::CACHE_MODE_NORMAL) {
        this->cacheMode_ = ToCacheModeEnum(ExtractCacheMode(buffer));
        this->oriAddress_ = ExtractL2CacheGmAddr(buffer);
        this->address_ = buffer;
    } else {
        this->address_ = L2CacheAlter<PrimType, CacheRwMode::RW>(buffer, this->cacheMode_);
    }
#else
    if (this->cacheMode_ == CacheMode::CACHE_MODE_NORMAL) {
        this->address_ = buffer;
    } else {
        this->address_ = L2CacheAlter<PrimType, CacheRwMode::RW>(buffer, cacheMode_);
    }
    this->oriAddress_ = buffer;
#endif
    bufferSize_ = bufferSize;
}

template <typename T>
__aicore__ inline void GlobalTensor<T>::SetGlobalBuffer(__gm__ typename GlobalTensor<T>::PrimType* buffer)
{
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
    this->oriAddress_ = buffer;
    // For the case GlobalTensor set through KFC message, origin cacheMode is lost.
    if (this->cacheMode_ == CacheMode::CACHE_MODE_NORMAL) {
        this->cacheMode_ = ToCacheModeEnum(ExtractCacheMode(buffer));
        this->oriAddress_ = ExtractL2CacheGmAddr(buffer);
        this->address_ = buffer;
    } else {
        this->address_ = L2CacheAlter<PrimType, CacheRwMode::RW>(buffer, this->cacheMode_);
    }
#else
    if (this->cacheMode_ == CacheMode::CACHE_MODE_NORMAL) {
        this->address_ = buffer;
    } else {
        this->address_ = L2CacheAlter<PrimType, CacheRwMode::RW>(buffer, cacheMode_);
    }
    this->oriAddress_ = buffer;
#endif
#if ASCENDC_CPU_DEBUG
    bufferSize_ = 0;
#endif
}

template <typename T> __aicore__ inline
    const __gm__ typename GlobalTensor<T>::PrimType* GlobalTensor<T>::GetPhyAddr() const
{
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
    return ExtractL2CacheGmAddr(this->address_);
#else
    return this->address_;
#endif
}

template <typename T> __aicore__ inline
    __gm__ typename GlobalTensor<T>::PrimType* GlobalTensor<T>::GetPhyAddr(const uint64_t offset) const
{
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
    if constexpr (SupportType<PrimType, int4b_t, fp4x2_e2m1_t, fp4x2_e1m2_t>()) {
        ASCENDC_DEBUG_ASSERT((offset % 2 == 0),
            KERNEL_LOG(KERNEL_ERROR, "The offset for int4b_t GetPhyAddr should be an even num."));
        return ExtractL2CacheGmAddr(this->address_ + offset / INT4_TWO);
#if (__NPU_ARCH__ == 5102)
    } else if constexpr (SupportType<PrimType, int2b_t>()) {
        ASCENDC_DEBUG_ASSERT((offset % 4 == 0),
            KERNEL_LOG(KERNEL_ERROR, "The offset for int2b_t GetPhyAddr should be multiples of 4."));
        return ExtractL2CacheGmAddr(this->address_ + offset / INT2_FOUR);
#endif
    } else {
        return ExtractL2CacheGmAddr(this->address_ + offset);
    }
#else
    if constexpr (IsHalfByteDataType<PrimType>()) {
        ASCENDC_DEBUG_ASSERT((offset % 2 == 0),
            KERNEL_LOG(KERNEL_ERROR, "The offset for int4b_t GetPhyAddr should be an even num."));
        return this->address_ + offset / INT4_TWO;
    } else {
        return this->address_ + offset;
    }
#endif
}

#ifdef __ASCENDC_ENABLE_SUPER_KERNEL__
template <typename T>
template <typename U>
__aicore__ inline uintptr_t GlobalTensor<T>::AlignPtr(__gm__ U* buffer) const
{
    uintptr_t addr = (reinterpret_cast<uintptr_t>(buffer)) & (~0b111111);
    return addr;
}
#endif

#ifdef __ASCENDC_ENABLE_SUPER_KERNEL__
template <typename T> __aicore__ inline __inout_pipe__(S)
    typename GlobalTensor<T>::PrimType GlobalTensor<T>::GetValue(const uint64_t offset)
{
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
    if constexpr (SupportType<PrimType, int4b_t, fp4x2_e2m1_t, fp4x2_e1m2_t>()) {
        __gm__ uint8_t *addr =
            reinterpret_cast<__gm__ uint8_t *>(ExtractL2CacheGmAddr(this->address_)) + offset / INT4_TWO;
        return static_cast<T>((*addr) >> (INT4_BIT_NUM * (offset % INT4_TWO)));
#if (__NPU_ARCH__ == 5102)
    } else if constexpr (SupportType<PrimType, int2b_t>()) {
        __gm__ uint8_t *addr =
            reinterpret_cast<__gm__ uint8_t *>(ExtractL2CacheGmAddr(this->address_)) + offset / INT2_FOUR;
        return static_cast<T>((*addr) >> (INT2_BIT_NUM * (offset % INT2_FOUR)));
#endif
    } else {
        __gm__ T* addr = ExtractL2CacheGmAddr(this->address_ + offset);
        if constexpr (SupportType<PrimType, complex32, complex64>()) {
            return T(addr->real, addr->imag);
        } else {
            return *addr;
        }
    }
#else
    if constexpr (IsHalfByteDataType<PrimType>()) {
        __gm__ uint8_t *addr = reinterpret_cast<__gm__ uint8_t *>(this->oriAddress_) + offset / INT4_TWO;
        uintptr_t tmpAddr = AlignPtr<uint8_t>(addr);
        if (tmpAddr != lastReadCacheAddr) {
            lastReadCacheAddr = tmpAddr;
            dcci(reinterpret_cast<__gm__ uint64_t *>(lastReadCacheAddr),
                cache_line_t::SINGLE_CACHE_LINE, dcci_dst_t::CACHELINE_OUT);
        }
        return static_cast<PrimType>((*addr) >> (INT4_BIT_NUM * (offset % INT4_TWO)));
    } else {
        uintptr_t tmpAddr = AlignPtr<PrimType>(this->oriAddress_ + offset);
        if (tmpAddr != lastReadCacheAddr) {
            lastReadCacheAddr = tmpAddr;
            dcci(reinterpret_cast<__gm__ uint64_t *>(lastReadCacheAddr),
                cache_line_t::SINGLE_CACHE_LINE, dcci_dst_t::CACHELINE_OUT);
        }
        return this->oriAddress_[offset];
    }
#endif
}
#endif

template <typename T> __aicore__ inline __inout_pipe__(S)
    typename GlobalTensor<T>::PrimType GlobalTensor<T>::GetValue(const uint64_t offset) const
{
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
    if constexpr (SupportType<PrimType, int4b_t, fp4x2_e2m1_t, fp4x2_e1m2_t>()) {
        __gm__ uint8_t *addr =
            reinterpret_cast<__gm__ uint8_t *>(ExtractL2CacheGmAddr(this->address_)) + offset / INT4_TWO;
        return static_cast<T>((*addr) >> (INT4_BIT_NUM * (offset % INT4_TWO)));
#if (__NPU_ARCH__ == 5102)
    } else if constexpr (SupportType<PrimType, int2b_t>()) {
        __gm__ uint8_t *addr =
            reinterpret_cast<__gm__ uint8_t *>(ExtractL2CacheGmAddr(this->address_)) + offset / INT2_FOUR;
        return static_cast<T>((*addr) >> (INT2_BIT_NUM * (offset % INT2_FOUR)));
#endif
    } else {
        __gm__ T* addr = ExtractL2CacheGmAddr(this->address_ + offset);
        if constexpr (SupportType<PrimType, complex32, complex64>()) {
            return T(addr->real, addr->imag);
        } else {
            return *addr;
        }
    }
#else
    if constexpr (IsHalfByteDataType<PrimType>()) {
        __gm__ uint8_t *addr = reinterpret_cast<__gm__ uint8_t *>(this->oriAddress_) + offset / INT4_TWO;
#ifdef __ASCENDC_ENABLE_SUPER_KERNEL__
        uintptr_t tmpAddr = AlignPtr<uint8_t>(addr);
        dcci(reinterpret_cast<__gm__ uint64_t *>(tmpAddr),
                cache_line_t::SINGLE_CACHE_LINE, dcci_dst_t::CACHELINE_OUT);
#endif
        return static_cast<PrimType>((*addr) >> (INT4_BIT_NUM * (offset % INT4_TWO)));
    } else {
#ifdef __ASCENDC_ENABLE_SUPER_KERNEL__
        uintptr_t tmpAddr = AlignPtr<PrimType>(this->oriAddress_ + offset);
        dcci(reinterpret_cast<__gm__ uint64_t *>(tmpAddr),
                cache_line_t::SINGLE_CACHE_LINE, dcci_dst_t::CACHELINE_OUT);
#endif
        return this->oriAddress_[offset];
    }
#endif
}

#ifdef __ASCENDC_ENABLE_SUPER_KERNEL__
template <typename T> __aicore__ inline __inout_pipe__(S)
    __gm__ typename GlobalTensor<T>::PrimType& GlobalTensor<T>::operator()(const uint64_t offset)
{
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
    __gm__ PrimType* addr = ExtractL2CacheGmAddr(this->address_);
    return addr[offset];
#else
    uintptr_t tmpAddr = AlignPtr<PrimType>(this->oriAddress_ + offset);
    if (tmpAddr != lastReadCacheAddr) {
        lastReadCacheAddr = tmpAddr;
        dcci(reinterpret_cast<__gm__ uint64_t *>(lastReadCacheAddr),
            cache_line_t::SINGLE_CACHE_LINE, dcci_dst_t::CACHELINE_OUT);
    }
    return this->oriAddress_[offset];
#endif
}
#endif

template <typename T> __aicore__ inline __inout_pipe__(S)
    __gm__ typename GlobalTensor<T>::PrimType& GlobalTensor<T>::operator()(const uint64_t offset) const

{
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
    __gm__ PrimType* addr = ExtractL2CacheGmAddr(this->address_);
    return addr[offset];
#else
#ifdef __ASCENDC_ENABLE_SUPER_KERNEL__
    uintptr_t tmpAddr = AlignPtr<PrimType>(this->oriAddress_ + offset);
    dcci(reinterpret_cast<__gm__ uint64_t *>(tmpAddr),
        cache_line_t::SINGLE_CACHE_LINE, dcci_dst_t::CACHELINE_OUT);
#endif
    return this->oriAddress_[offset];
#endif
}

template <typename T> __aicore__ inline
    void GlobalTensor<T>::SetValue(const uint64_t offset, typename GlobalTensor<T>::PrimType value)
{
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
    if constexpr (SupportType<PrimType, int4b_t, fp4x2_e2m1_t, fp4x2_e1m2_t>()) {
        __gm__ uint8_t *addr =
            reinterpret_cast<__gm__ uint8_t *>(ExtractL2CacheGmAddr(this->address_)) + offset / INT4_TWO;
        uint8_t mask = (offset % INT4_TWO == 0)? 0xf0 : 0xf;

        uint8_t val = (*addr) & mask;
        uint8_t shift = (offset % INT4_TWO == 0)? 0 : INT4_BIT_NUM;
        *addr = val + (value.storage << shift);
#if (__NPU_ARCH__ == 5102)
    } else if constexpr (SupportType<PrimType, int2b_t>()) {
        __gm__ uint8_t *addr =
            reinterpret_cast<__gm__ uint8_t *>(ExtractL2CacheGmAddr(this->address_)) + offset / INT2_FOUR;

        uint8_t shift = (offset % INT2_FOUR ) * 2;
        uint8_t mask = (0x3) << shift;
        uint8_t maskedVal = (*addr) & (~mask);
        *addr = maskedVal + (value.storage << shift);
#endif
    } else {
        __gm__ PrimType* addr = ExtractL2CacheGmAddr(this->address_ + offset);
        *addr = value;
    }
#else
    if constexpr (IsHalfByteDataType<PrimType>()) {
        __gm__ uint8_t *addr = reinterpret_cast<__gm__ uint8_t *>(this->oriAddress_) + offset / INT4_TWO;
        uint8_t mask = (offset % INT4_TWO == 0)? 0xf0 : 0xf;

#ifdef __ASCENDC_ENABLE_SUPER_KERNEL__
        if (lastWriteCacheAddr == 0) {
            lastWriteCacheAddr = AlignPtr<uint8_t>(addr);
        } else {
            uintptr_t tmpAddr = AlignPtr<uint8_t>(addr);
            if (tmpAddr != lastWriteCacheAddr ) {
                dcci(reinterpret_cast<__gm__ uint64_t *>(lastWriteCacheAddr),
                    cache_line_t::SINGLE_CACHE_LINE, dcci_dst_t::CACHELINE_OUT);
                lastWriteCacheAddr = tmpAddr;
            }
        }
#endif
        uint8_t val = (*addr) & mask;
        uint8_t shift = (offset % INT4_TWO == 0)? 0 : INT4_BIT_NUM;
        *addr = val + (value.storage << shift);
    } else {
#ifdef __ASCENDC_ENABLE_SUPER_KERNEL__
        if (lastWriteCacheAddr == 0) {
            lastWriteCacheAddr = AlignPtr<PrimType>(this->oriAddress_ + offset);
        } else {
            uintptr_t tmpAddr = AlignPtr<PrimType>(this->oriAddress_ + offset);
            if (tmpAddr != lastWriteCacheAddr) {
                dcci(reinterpret_cast<__gm__ uint64_t *>(lastWriteCacheAddr),
                    cache_line_t::SINGLE_CACHE_LINE, dcci_dst_t::CACHELINE_OUT);
                lastWriteCacheAddr = tmpAddr;
            }
        }
#endif
        this->oriAddress_[offset] = value;
    }
#endif
}

template <typename T> __aicore__ inline GlobalTensor<T>::GlobalTensor()
{
#ifdef __ASCENDC_ENABLE_SUPER_KERNEL__
    this->lastWriteCacheAddr = 0;
    this->lastReadCacheAddr = 0;
#endif
}

#ifdef __ASCENDC_ENABLE_SUPER_KERNEL__
template <typename T> __aicore__ inline GlobalTensor<T>::~GlobalTensor()
{
    if (lastWriteCacheAddr != 0) {
        dcci(reinterpret_cast<__gm__ uint64_t *>(lastWriteCacheAddr),
            cache_line_t::SINGLE_CACHE_LINE, dcci_dst_t::CACHELINE_OUT);
    }
}
#endif

template <typename T> __aicore__ inline uint64_t GlobalTensor<T>::GetSize() const
{
    return bufferSize_;
}

template <typename T> __aicore__ inline GlobalTensor<T> GlobalTensor<T>::operator[](const uint64_t offset) const
{
    GlobalTensor result = *this;
    if constexpr (IsHalfByteDataType<PrimType>()) {
        result.address_ = result.address_ + offset / INT4_TWO;
        result.oriAddress_ = result.oriAddress_ + offset / INT4_TWO;
#if (__NPU_ARCH__ == 5102)
    } else if constexpr (SupportType<PrimType, int2b_t>()) {
        result.address_ = result.address_ + offset / INT2_FOUR;
        result.oriAddress_ = result.oriAddress_ + offset / INT2_FOUR;
#endif
    } else {
        result.address_ = result.address_ + offset;
        result.oriAddress_ = result.oriAddress_ + offset;
    }
    return result;
}

template <typename T> __aicore__ inline void GlobalTensor<T>::SetShapeInfo(const ShapeInfo& shapeInfo)
{
    static_assert(IsSameType<T, PrimType>::value, "only primitive type Tensor has shape info!");
#ifndef SPLIT_CORE_CUBE
    shapeInfo_ = shapeInfo;
#endif
}

template <typename T> __aicore__ inline ShapeInfo GlobalTensor<T>::GetShapeInfo() const
{
    static_assert(IsSameType<T, PrimType>::value, "only primitive type Tensor has shape info!");
#ifndef SPLIT_CORE_CUBE
    return shapeInfo_;
#else
    ShapeInfo tmp;
    return tmp;
#endif
}

template <typename T>
template<CacheRwMode rwMode>
__aicore__ inline void GlobalTensor<T>::SetL2CacheHint(CacheMode mode) {
    this->cacheMode_ = mode;
#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
    this->address_ = L2CacheAlter<PrimType, rwMode>(this->address_, mode);
#else
    if (mode == CacheMode::CACHE_MODE_NORMAL) {
        this->address_ = this->oriAddress_;
    } else {
        this->address_ = L2CacheAlter<PrimType, rwMode>(this->oriAddress_, mode);
    }
#if defined(ASCENDC_OOM) && ASCENDC_OOM == 1
    AscendC::OOMAddAddrForL2Cache<PrimType>(this->address_, this->oriAddress_);
#endif // defined(ASCENDC_OOM) && ASCENDC_OOM == 1
#endif
}

#if (__NPU_ARCH__ == 3101) || (__NPU_ARCH__ == 5102)
template <typename T>
template <typename U> __aicore__ inline GlobalTensor<U> GlobalTensor<T>::ReinterpretCast() const
{
    GlobalTensor<U> output;
    output.address_ = reinterpret_cast<__gm__ U*>(reinterpret_cast<__gm__ int64_t*>(this->address_));
    output.oriAddress_ = reinterpret_cast<__gm__ U*>(reinterpret_cast<__gm__ int64_t*>(this->oriAddress_));
    output.bufferSize_ = this->GetSize() * SizeOfBits<T>::value / SizeOfBits<U>::value;
    output.cacheMode_ = this->cacheMode_;
#ifndef SPLIT_CORE_CUBE
    output.shapeInfo_ = this->shapeInfo_;
#endif
    return output;
}
#endif

template <Hardware hard>
__aicore__ inline LocalMemAllocator<hard>::LocalMemAllocator()
{
    static_assert((hard != Hardware::GM) && (hard != Hardware::MAX) && "illegal hardware position GM or MAX");
#if defined(ASCENDC_CPU_DEBUG) && (ASCENDC_CPU_DEBUG == 1)
    ASCENDC_DEBUG_ASSERT(ConstDefiner::Instance().CheckAllocatorUsed(hard) != true,
        KERNEL_LOG(
            KERNEL_ERROR, "only one LocalMemAllocator can exist at the same hardware position at any given time."));
#endif
}

template <Hardware hard>
__aicore__ inline uint32_t LocalMemAllocator<hard>::GetCurAddr() const
{
    return head_;
}

template <Hardware hard>
template <TPosition pos, class DataType, uint32_t tileSize>
__aicore__ inline LocalTensor<DataType> LocalMemAllocator<hard>::Alloc()
{
    static_assert(!is_tensorTrait_v<DataType>, "currently not support TensorTrait type!");
    static_assert(GetPhyType(pos) == hard, "logic pos and hardware pos not matched.");
    LocalTensor<DataType> output(pos, head_, tileSize);
    head_ += SizeOfBits<DataType>::value * tileSize / SizeOfBits<uint8_t>::value;
    return output;
}

template <Hardware hard>
template <class DataType, uint32_t tileSize>
__aicore__ inline LocalTensor<DataType> LocalMemAllocator<hard>::Alloc()
{
    static_assert(!is_tensorTrait_v<DataType>, "currently not support TensorTrait type!");
    LocalTensor<DataType> output(GetDefaultPosition(hard), head_, tileSize);
    head_ += SizeOfBits<DataType>::value * tileSize / SizeOfBits<uint8_t>::value;
    return output;
}

template <Hardware hard>
template <TPosition pos, class DataType>
__aicore__ inline LocalTensor<DataType> LocalMemAllocator<hard>::Alloc(uint32_t tileSize)
{
    static_assert(!is_tensorTrait_v<DataType>, "currently not support TensorTrait type!");
    static_assert(GetPhyType(pos) == hard, "logic pos and hardware pos not matched.");
    LocalTensor<DataType> output(pos, head_, tileSize);
    head_ += SizeOfBits<DataType>::value * tileSize / SizeOfBits<uint8_t>::value;
    return output;
}

template <Hardware hard>
template <class DataType>
__aicore__ inline LocalTensor<DataType> LocalMemAllocator<hard>::Alloc(uint32_t tileSize)
{
    static_assert(!is_tensorTrait_v<DataType>, "currently not support TensorTrait type!");
    LocalTensor<DataType> output(GetDefaultPosition(hard), head_, tileSize);
    head_ += SizeOfBits<DataType>::value * tileSize / SizeOfBits<uint8_t>::value;
    return output;
}

template <Hardware hard>
template <class TensorTraitType>
__aicore__ inline LocalTensor<TensorTraitType> LocalMemAllocator<hard>::Alloc()
{
    static_assert(is_tensorTrait_v<TensorTraitType>, "only support TensorTrait type!");
    static_assert(GetPhyType(TensorTraitType::tPos) == hard, "logic pos and hardware pos not matched.");
    LocalTensor<TensorTraitType> output(head_);
    head_ += SizeOfBits<typename TensorTraitType::LiteType>::value * output.GetSize() / SizeOfBits<uint8_t>::value;
    return output;
}
}
#endif // KERNEL_TENSOR_H
